## import libraries
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import os
from math import ceil
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm as cm
import umap
import umap.plot
from sklearn.base import clone
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, PowerTransformer, RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift, DBSCAN, estimate_bandwidth
from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import graphviz
import warnings
warnings.filterwarnings('ignore')
# https://stackoverflow.com/questions/56618739/matplotlib-throws-warning-message-because-of-findfont-python
import logging
logging.getLogger('matplotlib.font_manager').disabled = True
# importing sompy enables more logging
logging.disable(logging.INFO)
import sompy
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView
#!pip install kneed
from kneed import KneeLocator
## Note versions used
print('Pandas version ',pd.__version__)
print('Numpy version ',np.__version__)
print('Scipy version ',sp.__version__)
print('Seaborn version ',sns.__version__)
Pandas version 1.3.3 Numpy version 1.20.3 Scipy version 1.7.0 Seaborn version 0.11.2
COLORS = ['#7a2b5d', '#975c86', '#b38dad', '#d0bed6', '#edf0ff', '#eae1cd', '#e8d19a', '#e6c064', '#e3ae29']
CAT_COLORS = ['#7a2b5d', '#EBBD53', '#4475B0', '#21BFAB', '#EF5072']
CONTRAST_COLORS = [COLORS[i] for i in range(len(COLORS)) if (i%2) == 0]
DIV_COLORS = COLORS.copy()
DEFAULT_PALETTE = sns.color_palette(COLORS)
CONTRAST_PALETTE = sns.color_palette(CONTRAST_COLORS)
DIVERGENT_PALETTE = sns.color_palette(DIV_COLORS)
CAT_PALETTE = sns.color_palette(CAT_COLORS)
DIV_CMAP = LinearSegmentedColormap.from_list("div_colors", DIV_COLORS)
CAT_CMAP = LinearSegmentedColormap.from_list("cat_colors", CAT_COLORS)
SHOW_PLOTS = True
SAVE_PLOTS = True
RANDOM_STATE = 0
sns.palplot(DEFAULT_PALETTE)
sns.palplot(CONTRAST_PALETTE)
sns.palplot(DIVERGENT_PALETTE)
sns.palplot(CAT_PALETTE)
plt.show()
sns.set(style="white")
sns.set_context("paper")
sns.set_palette(DEFAULT_PALETTE)
plt.rcParams['figure.dpi'] = 70
show_plots = True
img_counter = 0
random_state = 0
IMG_PATH = '../../out/imgs/'
def save_fig(title, fig):
if SAVE_PLOTS == True:
fn = IMG_PATH + title.replace(' ','-') + '.png'
fig.savefig(fn, bbox_inches='tight')
## Function to plot histograms of numeric features for specified dataframe
def plot_histograms_boxplots(df, features, rows=4, title = "Histograms of Numeric Variables"):
if SHOW_PLOTS:
cols = ceil(len(features) / rows)
fig = plt.figure(figsize=(4*cols,4*rows),
constrained_layout=True)
subfigs = fig.subfigures(rows, cols)
for subf, feat in zip(subfigs.flatten(), features):
axs = subf.subplots(2, 1, sharex=True, \
gridspec_kw={'height_ratios': [4,1]})
axs[0].hist(df[feat], color=COLORS[0])
axs[0].set_ylabel(None)
axs[0].set_title(feat, y=1, fontsize=6*rows)
axs[1].set_xlabel(None)
flierprops = dict(markerfacecolor='None', markersize=6, markeredgecolor=COLORS[0])
sns.boxplot(x=df[feat], ax=axs[1], color=COLORS[0], flierprops=flierprops)
axs[1].set_xlabel(None)
subf.suptitle(None)
plt.suptitle(title, fontsize=8*rows)
if SAVE_PLOTS:
save_fig(title, fig)
plt.show()
else:
print("show_plots is currently set to False")
def getIQR(df, colname) :
q25 = df[colname].quantile(.25)
q75 = df[colname].quantile(.75)
iqr = (q75 - q25)
upper_lim = q75 + 2 * iqr
lower_lim = q25 - 2 * iqr
above_ul = df.loc[df[colname]>upper_lim]
below_ll = df.loc[df[colname]<lower_lim]
if len(above_ul) > 0 :
print(str(len(above_ul)) + " or " + str(round((100*len(above_ul)/len(df)),4)) + "% of rows are above the UL ["+ colname + "].")
if len(below_ll) > 0 :
print(str(len(below_ll)) + " or " + str(round((100*len(below_ll)/len(df)),4)) + "% of rows are below the LL ["+ colname + "].")
return upper_lim, lower_lim, len(above_ul), len(below_ll)
df = pd.read_excel('../data/WonderfulWinesoftheWorld.xlsx')
## Remove last row
df.drop(df.tail(1).index,inplace=True)
df['Custid'] = df['Custid'].astype(int)
df.set_index('Custid', inplace=True)
df_original = df.copy(deep=True)
df.head(3)
| Dayswus | Age | Edu | Income | Freq | Recency | Monetary | LTV | Perdeal | Dryred | Sweetred | Drywh | Sweetwh | Dessert | Exotic | WebPurchase | WebVisit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||
| 5325 | 653.0 | 55.0 | 20.0 | 78473.0 | 20.0 | 18.0 | 826.0 | 445.0 | 7.0 | 67.0 | 4.0 | 26.0 | 2.0 | 1.0 | 1.0 | 36.0 | 5.0 |
| 3956 | 1041.0 | 75.0 | 18.0 | 105087.0 | 36.0 | 33.0 | 1852.0 | 539.0 | 2.0 | 49.0 | 0.0 | 46.0 | 1.0 | 3.0 | 0.0 | 20.0 | 4.0 |
| 3681 | 666.0 | 18.0 | 12.0 | 27984.0 | 4.0 | 56.0 | 39.0 | -7.0 | 88.0 | 4.0 | 29.0 | 14.0 | 32.0 | 21.0 | 48.0 | 60.0 | 8.0 |
df.describe().to_csv('../../out/data/data_summary.csv')
df.dtypes
Dayswus float64 Age float64 Edu float64 Income float64 Freq float64 Recency float64 Monetary float64 LTV float64 Perdeal float64 Dryred float64 Sweetred float64 Drywh float64 Sweetwh float64 Dessert float64 Exotic float64 WebPurchase float64 WebVisit float64 dtype: object
All variables are numeric.
print(df[df.duplicated(keep=False)])
Empty DataFrame Columns: [Dayswus, Age, Edu, Income, Freq, Recency, Monetary, LTV, Perdeal, Dryred, Sweetred, Drywh, Sweetwh, Dessert, Exotic, WebPurchase, WebVisit] Index: []
## Wine preference features
wine_features = ['Dryred', 'Sweetred', 'Drywh', 'Sweetwh', 'Dessert', 'Exotic']
## Value segmentation features
value_features = ['Dayswus', 'Freq', 'Recency', 'Monetary', 'LTV', 'Perdeal']
## Separate Demographic features for cluster descriptions later
demog_features = ['Age', 'Edu', 'Income']
## Separate other features
other_features = ['WebPurchase', 'WebVisit']
plot_histograms_boxplots(df, value_features, rows=2, title='Distribution of Value Segmentation Features')
plot_histograms_boxplots(df, wine_features, rows=2, title='Distribution of Wine Segmentation Features')
plot_histograms_boxplots(df, (demog_features+other_features), rows=2, title='Distribution of Other Features')
Because of it's unusual distribution
fig, axes = plt.subplots(2,round(len(value_features)/2),figsize=(15,9), sharex=True)
for f, ax in zip(range(len(value_features)),axes.flatten()):
sns.scatterplot(data=df, x='Recency', y=value_features[f],
alpha=.25, s=80, marker='x', palette=CAT_CMAP,
ax=ax
)
plt.suptitle('Recency vs Value Features', fontsize=20)
plt.tight_layout()
plt.show()
fig, axes = plt.subplots(2,round(len(wine_features)/2),figsize=(15,9), sharex=True)
for f, ax in zip(range(len(wine_features)),axes.flatten()):
sns.scatterplot(data=df, x='Recency', y=wine_features[f],
alpha=.25, s=80, marker='x', palette=CAT_CMAP,
ax=ax
)
plt.suptitle('Recency vs Wine Features', fontsize=20)
plt.tight_layout()
plt.show()
do_features = demog_features+other_features
fig, axes = plt.subplots(2,round(len(do_features)/2),figsize=(15,9), sharex=True)
for f, ax in zip(range(len(do_features)),axes.flatten()):
sns.scatterplot(data=df, x='Recency', y=do_features[f],
alpha=.25, s=80, marker='x', palette=CAT_CMAP,
ax=ax
)
plt.suptitle('Recency vs Other Features', fontsize=20)
plt.tight_layout()
plt.show()
Consider the points at Recency > 100 as "Lost Customers" in its own cluster. Observing in particular the relationship between Recency and Frequency, it appears that customers are unlikely to repurchase if their last transaction has been more than 100 days past.
som_vars = value_features + wine_features + demog_features + other_features
## From Lab 11
np.random.seed(RANDOM_STATE)
sm = sompy.SOMFactory().build(
df[som_vars].values,
mapsize=[15,15],
initialization='random',
neighborhood='gaussian',
training='batch',
lattice='hexa',
component_names=som_vars
)
sm.train(n_job=4, train_rough_len=50, train_finetune_len=50)
# Visualizing the Component planes (feature values)
view2D = View2D(2, 2, "", text_size=20)
view2D.show(sm, col_sz=5, what='codebook')
plt.suptitle("Component Planes", fontsize=20,y=.9)
if SAVE_PLOTS:
save_fig('Component Planes', fig)
plt.show()
def make_corr_heatmap(df, method, title="Triangle Correlation Heatmap"):
l = len(df.columns.tolist())
fig = plt.figure(figsize=(2*l, 1.5*l))
mask = np.triu(np.ones_like(df.corr(method=method), dtype=bool))
heatmap = sns.heatmap(df.corr(method=method), mask=mask, vmin=-1, vmax=1, annot=True, fmt='.2f' ,cmap=DIV_CMAP)
heatmap.set_title(title, fontdict={'fontsize':18}, pad=2);
save_fig(title, fig)
plt.show()
make_corr_heatmap(df[value_features], 'spearman', title="Spearman Correlation Heatmap: Value Features")
make_corr_heatmap(df[value_features], 'pearson', title="Pearson Correlation Heatmap: Value Features")
## Remove Perdeal because highly correlated with multiple feats
value_features.remove('Perdeal')
other_features.append('Perdeal')
## Remove Monetary because highly correlated with Freq, LTV
value_features.remove('Monetary')
other_features.append('Monetary')
## Remove Frequency and LTV because highly correlated with Monetary
value_features2 = ['Dayswus', 'Recency', 'Monetary']
value_features
['Dayswus', 'Freq', 'Recency', 'LTV']
make_corr_heatmap(df[value_features], 'spearman', title="Spearman Correlation: Value Features (reduced)")
make_corr_heatmap(df[wine_features], 'spearman', title="Spearman Correlation Heatmap: Wine Features")
make_corr_heatmap(df[wine_features], 'pearson', title="Pearson Correlation Heatmap: Wine Features")
for v in df.columns.tolist():
getIQR(df, v)
363 or 3.63% of rows are above the UL [Recency]. 117 or 1.17% of rows are above the UL [LTV]. 358 or 3.58% of rows are above the UL [Sweetred]. 2 or 0.02% of rows are above the UL [Drywh]. 365 or 3.65% of rows are above the UL [Sweetwh]. 474 or 4.74% of rows are above the UL [Dessert]. 331 or 3.31% of rows are above the UL [Exotic].
Too many outliers to remove just based on IQR
df['Outlier'] = 0
df.loc[df['Dessert']>70,['Outlier']] = 1
df.loc[df['Sweetred']>70,['Outlier']] = 1
df.loc[df['Freq']>55,['Outlier']] = 1
df.loc[df['LTV']>1750,['Outlier']] = 1
df_hasoutliers = df.copy()
df = df[df['Outlier']==0]
df_outliers = df[df['Outlier']==1]
plot_histograms_boxplots(df, value_features, rows=2, title='Value Segmentation (Outliers Removed)')
plot_histograms_boxplots(df, wine_features, rows=2, title='Wine Segmentation (Outliers Removed)')
df = df.loc[df['Recency']<=100,:]
We only scale the Value Segmentation Features because the Wine Segmentation Features are all in the same scale (percentage) already.
mmscaler = MinMaxScaler()
value_feats_mm = value_features.copy()
for fi in range(len(value_feats_mm)):
t = value_feats_mm[fi] + '_mm'
value_feats_mm[fi] = t
df[t] = df[value_features[fi]]
df[value_feats_mm] = mmscaler.fit_transform(df[value_feats_mm])
plot_histograms_boxplots(df, value_feats_mm, rows=2, title='Value Variables with MinMax Transform')
wine_feats_dec = wine_features.copy()
for fi in range(len(wine_feats_dec)):
t = wine_feats_dec[fi] + '_dec'
wine_feats_dec[fi] = t
df[t] = df[wine_features[fi]]/100
vars_ = wine_feats_dec + value_feats_mm
neigh = NearestNeighbors(n_neighbors=50)
neigh.fit(df[vars_])
distances, _ = neigh.kneighbors(df[vars_])
distances = np.sort(distances[:, -1])
fig, axis = plt.subplots(figsize=(9,7))
plt.plot(distances, color=COLORS[0])
plt.title('K-distance graph')
plt.show()
save_fig('K-distance graph', fig)
# Perform DBSCAN clustering
dbscan = DBSCAN(eps=.35, min_samples=20, n_jobs=4)
dbscan_labels = dbscan.fit_predict(df[vars_])
dbscan_n_clusters = len(np.unique(dbscan_labels))
print("Number of estimated clusters : %d" % dbscan_n_clusters)
Number of estimated clusters : 2
# Concatenating the labels to df
df_dbscan = pd.concat([df[vars_], pd.Series(dbscan_labels, index=df.index, name="dbscan_labels")], axis=1)
df_dbscan.groupby(['dbscan_labels']).count()
| Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | |
|---|---|---|---|---|---|---|---|---|---|---|
| dbscan_labels | ||||||||||
| -1 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 |
| 0 | 9517 | 9517 | 9517 | 9517 | 9517 | 9517 | 9517 | 9517 | 9517 | 9517 |
df_ = pd.concat([df, pd.Series(dbscan_labels, index=df.index, name="Noise")], axis=1)
df_noise = df_.loc[df_['Noise'] == -1]
df = df_.loc[df_['Noise'] != -1]
df_nonoise = df_.loc[df_['Noise'] != -1]
df.drop(columns=['Noise'], inplace=True)
df
| Dayswus | Age | Edu | Income | Freq | Recency | Monetary | LTV | Perdeal | Dryred | ... | Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 653.0 | 55.0 | 20.0 | 78473.0 | 20.0 | 18.0 | 826.0 | 445.0 | 7.0 | 67.0 | ... | 0.147143 | 0.358491 | 0.18 | 0.348824 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | 0.01 |
| 3956 | 1041.0 | 75.0 | 18.0 | 105087.0 | 36.0 | 33.0 | 1852.0 | 539.0 | 2.0 | 49.0 | ... | 0.701429 | 0.660377 | 0.33 | 0.401456 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | 0.00 |
| 3681 | 666.0 | 18.0 | 12.0 | 27984.0 | 4.0 | 56.0 | 39.0 | -7.0 | 88.0 | 4.0 | ... | 0.165714 | 0.056604 | 0.56 | 0.095745 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | 0.48 |
| 2829 | 1049.0 | 42.0 | 16.0 | 61748.0 | 2.0 | 46.0 | 37.0 | -6.0 | 70.0 | 86.0 | ... | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | 0.55 |
| 8788 | 837.0 | 47.0 | 16.0 | 65789.0 | 2.0 | 3.0 | 36.0 | 4.0 | 35.0 | 85.0 | ... | 0.410000 | 0.018868 | 0.03 | 0.101904 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | 0.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 774.0 | 43.0 | 18.0 | 42853.0 | 4.0 | 33.0 | 59.0 | -17.0 | 90.0 | 73.0 | ... | 0.320000 | 0.056604 | 0.33 | 0.090146 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | 0.07 |
| 1383 | 1132.0 | 57.0 | 20.0 | 81033.0 | 19.0 | 59.0 | 776.0 | 187.0 | 22.0 | 78.0 | ... | 0.831429 | 0.339623 | 0.59 | 0.204367 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | 0.11 |
| 4070 | 596.0 | 66.0 | 15.0 | 84714.0 | 18.0 | 45.0 | 720.0 | 391.0 | 5.0 | 30.0 | ... | 0.065714 | 0.320755 | 0.45 | 0.318589 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | 0.13 |
| 7909 | 619.0 | 18.0 | 12.0 | 40466.0 | 3.0 | 65.0 | 47.0 | 5.0 | 23.0 | 6.0 | ... | 0.098571 | 0.037736 | 0.65 | 0.102464 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | 0.41 |
| 4914 | 979.0 | 55.0 | 16.0 | 94926.0 | 25.0 | 28.0 | 1148.0 | 293.0 | 7.0 | 63.0 | ... | 0.612857 | 0.452830 | 0.28 | 0.263718 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | 0.04 |
9517 rows × 28 columns
print('Percentage of rows considered noise:')
100*len(df_dbscan.loc[df_dbscan['dbscan_labels'] == -1])/len(df)
Percentage of rows considered noise:
0.34674792476620786
len(df_noise.loc[df_noise['Recency']>100,:])/len(df_noise)
0.0
## Original value features
print(value_features)
df[value_features].head(3)
['Dayswus', 'Freq', 'Recency', 'LTV']
| Dayswus | Freq | Recency | LTV | |
|---|---|---|---|---|
| Custid | ||||
| 5325 | 653.0 | 20.0 | 18.0 | 445.0 |
| 3956 | 1041.0 | 36.0 | 33.0 | 539.0 |
| 3681 | 666.0 | 4.0 | 56.0 | -7.0 |
## Original wine features
print(wine_features)
df[wine_features].head(3)
['Dryred', 'Sweetred', 'Drywh', 'Sweetwh', 'Dessert', 'Exotic']
| Dryred | Sweetred | Drywh | Sweetwh | Dessert | Exotic | |
|---|---|---|---|---|---|---|
| Custid | ||||||
| 5325 | 67.0 | 4.0 | 26.0 | 2.0 | 1.0 | 1.0 |
| 3956 | 49.0 | 0.0 | 46.0 | 1.0 | 3.0 | 0.0 |
| 3681 | 4.0 | 29.0 | 14.0 | 32.0 | 21.0 | 48.0 |
## MinMax transformed value features
print(value_feats_mm)
df[value_feats_mm].head(3)
['Dayswus_mm', 'Freq_mm', 'Recency_mm', 'LTV_mm']
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | |
|---|---|---|---|---|
| Custid | ||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 |
## Decimal wine features
print(wine_feats_dec)
df[wine_feats_dec].head(3)
['Dryred_dec', 'Sweetred_dec', 'Drywh_dec', 'Sweetwh_dec', 'Dessert_dec', 'Exotic_dec']
| Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|
| Custid | ||||||
| 5325 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | 0.01 |
| 3956 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | 0.00 |
| 3681 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | 0.48 |
bins = 5
rfm_df = df.loc[df['Recency']<=100,:]
rfm_df['Recency_Rank'] = bins - pd.qcut(rfm_df['Recency'], bins, labels=False)
rfm_df['Frequency_Rank'] = 1 + pd.qcut(rfm_df['Freq'], bins, labels=False)
rfm_df['Monetary_Rank'] = 1+ pd.qcut(rfm_df['Monetary'], bins, labels=False)
rfm_df['RFM_Ave'] = rfm_df.loc[:,['Recency_Rank','Frequency_Rank','Monetary_Rank']].mean(axis=1)
bins = 5
df['Recency_Rank'] = 0
df['Frequency_Rank'] = 0
df['Monetary_Rank'] = 0
df.loc[df['Recency']<=100,'Recency_Rank'] = bins - \
pd.qcut(df.loc[df['Recency']<=100,'Recency'], bins, labels=False)
df.loc[df['Recency']<=100,'Frequency_Rank'] = 1 + pd.qcut(df.loc[df['Recency']<=100,'Freq'], bins, labels=False)
df.loc[df['Recency']<=100,'Monetary_Rank'] = 1+ pd.qcut(df.loc[df['Recency']<=100,'Monetary'], bins, labels=False)
#df.loc[df['Recency']<=100,'RFM_Ave'] = df.loc[df['Recency']<=100,['Recency_Rank','Frequency_Rank','Monetary_Rank']].mean(axis=1)
df['RFM_Ave'] = df.loc[:,['Recency_Rank','Frequency_Rank','Monetary_Rank']].mean(axis=1)
df.loc[:,['Recency','Recency_Rank','Freq','Frequency_Rank','Monetary_Rank','RFM_Ave']]
| Recency | Recency_Rank | Freq | Frequency_Rank | Monetary_Rank | RFM_Ave | |
|---|---|---|---|---|---|---|
| Custid | ||||||
| 5325 | 18.0 | 5 | 20.0 | 4 | 4 | 4.333333 |
| 3956 | 33.0 | 4 | 36.0 | 5 | 5 | 4.666667 |
| 3681 | 56.0 | 3 | 4.0 | 1 | 1 | 1.666667 |
| 2829 | 46.0 | 3 | 2.0 | 1 | 1 | 1.666667 |
| 8788 | 3.0 | 5 | 2.0 | 1 | 1 | 2.333333 |
| ... | ... | ... | ... | ... | ... | ... |
| 7989 | 33.0 | 4 | 4.0 | 1 | 2 | 2.333333 |
| 1383 | 59.0 | 3 | 19.0 | 4 | 4 | 3.666667 |
| 4070 | 45.0 | 3 | 18.0 | 4 | 4 | 3.666667 |
| 7909 | 65.0 | 2 | 3.0 | 1 | 1 | 1.333333 |
| 4914 | 28.0 | 4 | 25.0 | 4 | 4 | 4.000000 |
9517 rows × 6 columns
rfm_rank_features = ['Frequency_Rank', 'Recency_Rank', 'Monetary_Rank', 'RFM_Ave']
rfm_features = ['Freq', 'Recency', 'Monetary',]
quantile_values = [rfm_df['Recency'].quantile(i) for i in [.2,.4,.6,.8]]
freq_quints = [rfm_df['Freq'].quantile(i) for i in [.2,.4,.6,.8]]
#quantile_values
#freq_quints
fig, ax = plt.subplots(figsize=(19,13))
for q in range(len(quantile_values)):
ax.axvline(quantile_values[q]+.49,
label=str(int(quantile_values[q]))+'th Percentile',
color='#CCCCCC')
#ax.axhline(freq_quints[q]+.5,
# label=str(int(quantile_values[q]))+'th Percentile',
# color='#CCCCCC')
scatter = sns.scatterplot(data=rfm_df, x='Recency', y='Freq', hue='Frequency_Rank',
alpha=.25, s=80, marker='o', palette=CAT_CMAP,
#style='Recency_Rank'
)
ax.set_ylabel('Frequency', fontsize=20)
ax.set_xlabel('Recency', fontsize=20)
ax.legend([])
rfm_title='Recency vs Frequency Quintiles'
plt.suptitle(rfm_title, fontsize=20, y=.91)
plt.margins(0.025, 0.05)
if SAVE_PLOTS:
save_fig(rfm_title, fig)
plt.show()
## Code from Lab #13
def get_ss(df):
"""Computes the sum of squares for all variables given a dataset
"""
ss = np.sum(df.var() * (df.count() - 1))
return ss # return sum of sum of squares of each df variable
def r2_(df, labels):
sst = get_ss(df)
ssw = np.sum(df.groupby(labels).apply(get_ss))
return 1 - ssw/sst
def r2(dft, df, labels, feats):
sst = get_ss(dft[feats])
ssw = np.sum(df.groupby(labels).apply(get_ss))
return 1 - ssw/sst
def get_r2_scores(df, clusterer, min_k=2, max_k=10):
"""
Loop over different values of k. To be used with sklearn clusterers.
"""
r2_clust = {}
for n in range(min_k, max_k):
clust = clone(clusterer).set_params(n_clusters=n)
labels = clust.fit_predict(df)
r2_clust[n] = r2_(df, labels)
return r2_clust
def showR2Plot(r2_scores, title=""):
# Visualizing the R² scores for each cluster solution on demographic variables
fig, ax = plt.subplots(figsize=(10,7))
pd.DataFrame(r2_scores).plot(ax=ax)
plt.title("R² Plot: " + title)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("R² metric")
save_fig('R2_Plot_'+ title, fig)
plt.show()
def show_cluster_heatmap(df, label1, cols, title='Heatmap of Cluster Means'):
fig, ax = plt.subplots(constrained_layout=True , figsize=(len(cols)*1.4,len(cols)*.8), sharey=True)
label_counts = df.groupby(label1)[label1].count()
xticks = [(str(l) + "\n" + str(label_counts[l])) for l in range(len(label_counts))]
sns.heatmap(df.groupby(label1)[cols].mean().loc[range(0,len(label_counts)),:].T, \
vmin=0, vmax=1,
cmap=DIV_CMAP, xticklabels=xticks, annot=True, fmt='.2f', annot_kws={"fontsize":14})
k = len(df[label1].unique().tolist())
ax.set_xlabel(('Cluster Sizes and Labels, K = '+str(k)))
fig.suptitle(title, y=1.1, fontsize=20)
if SAVE_PLOTS:
save_fig(title, fig)
plt.show()
def compare_cluster_heatmap(df, k1, k2, label1, label2, cols, title='Heatmap of Cluster Means'):
fig, ax = plt.subplots(1,2,gridspec_kw={'width_ratios': [k1, k2]} ,\
constrained_layout=True , figsize=(14,len(cols)*.8), sharey=True)
label_counts = df.groupby(label1)[label1].count()
xticks = [(str(l) + "\n" + str(label_counts[l])) for l in range(len(label_counts))]
sns.heatmap(df.groupby(label1)[cols].mean().loc[range(0,k1),:].T, \
vmin=0, vmax=1,\
cmap=DIV_CMAP, ax=ax[0], xticklabels=xticks, annot=True, fmt='.2f', annot_kws={"fontsize":16})
label_counts = df.groupby(label2)[label2].count()
xticks = [(str(l) + "\n" + str(label_counts[l])) for l in range(len(label_counts))]
sns.heatmap(df.groupby(label2)[cols].mean().loc[range(0,k2),:].T, \
vmin=0, vmax=1,\
cmap=DIV_CMAP, ax=ax[1], xticklabels=xticks, annot=True, fmt='.2f', annot_kws={"fontsize":16})
ax[0].set_xlabel('Cluster Sizes and Labels, K='+str(k1))
ax[1].set_xlabel('Cluster Sizes and Labels, K='+str(k2))
fig.suptitle(title, y=1.1, fontsize=20)
save_fig(title, fig)
plt.show()
def show_elbow_silhouette(df, features, max_k=40):
n_clusters = range(2,max_k)
silhouette_scores = []
sum_squared_dist = []
r_scores = []
for num_clusters in n_clusters:
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=1500,n_init=100,random_state=RANDOM_STATE)
kmeans.fit(df[features])
cluster_labels = kmeans.labels_
sum_squared_dist.append(kmeans.inertia_)
silhouette_scores.append(silhouette_score(df[features], cluster_labels,\
metric='euclidean', \
sample_size=None, random_state=None))
df['labels'] = cluster_labels
r_scores.append(r2_(df, 'labels'))
fig, ax1 = plt.subplots(figsize=(11,7))
ax2 = ax1.twinx()
ax1.plot(n_clusters,sum_squared_dist, color=CAT_COLORS[0], label='Inertia')
#ax2.plot(n_clusters, r_scores, color=CAT_COLORS[2], label='R2' )
ax1.set_xlabel('Values of K')
ax1.set_ylabel('Inertia', color=COLORS[0])
#ax2.set_ylabel('R2 Score', color=CAT_COLORS[2])
#ax2.yaxis.set_label_coords(.97, 0.5)
kl = KneeLocator(range(1, len(sum_squared_dist)+1), sum_squared_dist, curve="convex", direction="decreasing")
ax1.axvline(x=kl.elbow, label='Elbow', color=CAT_COLORS[0], linestyle='dashed')
fig.legend(loc="center right", bbox_to_anchor=(.85, .5))
plt.xticks(range(1,max_k+1,2))
plt.title('Inertia Plot')
save_fig('Inertia R2 Plot', fig)
plt.show()
print('Knee located at k=', kl.elbow)
def plot_clusters(df, labels, title='Cluster Visualization'):
fig, ax = plt.subplots(figsize=(10,10))
scatter = ax.scatter(x=df[0], y=df[1], c=labels, cmap=CAT_CMAP,\
s=2, marker="o", alpha=.75, label='Final Clusters')
ncol = len(labels.unique().tolist())
ncol = round(ncol/2) + 1 if ncol > 7 else ncol
legend1 = ax.legend(*scatter.legend_elements(),
bbox_to_anchor=(.5,1,.5,1), loc="lower left", frameon=False,
mode='expand', borderaxespad=0, ncol=ncol)
ax.set_xticklabels('')
ax.set_yticklabels('')
plt.title(title, loc='left')
if SAVE_PLOTS:
save_fig(title, fig)
plt.show()
## Code based from Lab 09
def plot_dendrogram(model, title, **kwargs):
sns.set_palette(CAT_PALETTE)
# Create linkage matrix and then plot the dendrogram
fig, ax = plt.subplots(figsize=(11,5))
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack(
[model.children_, model.distances_, counts]
).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
plt.hlines(kwargs['color_threshold'], 0, 1000, colors="r", linestyles="dashed")
plt.xticks(rotation=90)
plt.title(title)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
save_fig(title, fig)
plt.show()
sns.set_palette(DEFAULT_PALETTE)
## Scoring
r_scores_wine = pd.DataFrame(columns=['method','r2','clusters'])
r_scores_value = pd.DataFrame(columns=['method','r2','clusters'])
df_wine_kmeans = df_nonoise[wine_feats_dec].copy()
show_elbow_silhouette(df_wine_kmeans, wine_feats_dec, max_k=10)
Knee located at k= 3
show_elbow_silhouette(df_wine_kmeans, wine_feats_dec, max_k=20)
Knee located at k= 5
df_wk = df_wine_kmeans.copy()
for k in range(2,10):
wine_kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=30, random_state=RANDOM_STATE)
wine_k_labels = wine_kmeans.fit_predict(df_wk)
df_wk['wine_'+str(k)] = wine_k_labels
r2_(df_wk[wine_feats_dec+['wine_'+str(k)]],'wine_'+str(k))
r_append = {
'method': 'kmeans',
'r2': r2_(df_wk[wine_feats_dec+['wine_'+str(k)]],'wine_'+str(k)),
'clusters':k
}
r_scores_wine = r_scores_wine.append(r_append, ignore_index=True)
compare_cluster_heatmap(df_wk, 3, 4, 'wine_3', 'wine_4', wine_feats_dec, 'Heatmap of Cluster Means, Wine Segmentation: K=3 vs K=4')
compare_cluster_heatmap(df_wk, 5, 6, 'wine_5', 'wine_6', wine_feats_dec, 'Heatmap of Cluster Means, Wine Segmentation: K=5 vs K=6')
r_scores_wine
| method | r2 | clusters | |
|---|---|---|---|
| 0 | kmeans | 0.689819 | 2 |
| 1 | kmeans | 0.871299 | 3 |
| 2 | kmeans | 0.903283 | 4 |
| 3 | kmeans | 0.950709 | 5 |
| 4 | kmeans | 0.961017 | 6 |
| 5 | kmeans | 0.978910 | 7 |
| 6 | kmeans | 0.982729 | 8 |
| 7 | kmeans | 0.987788 | 9 |
r_scores_wine.to_csv('../../out/data/r_scores_wine.csv')
wine_k = 3
wine_kmeans = KMeans(n_clusters=wine_k, init='k-means++', max_iter=1000000, n_init=30, random_state=RANDOM_STATE)
wine_k_labels = wine_kmeans.fit_predict(df_wine_kmeans)
df_wine_kmeans['wine_labels'] = wine_k_labels
r2_(df_wine_kmeans,'wine_labels')
0.9033591898983989
show_cluster_heatmap(df_wine_kmeans, 'wine_labels', wine_feats_dec, 'Cluster Means of Wine Segmentation: KMeans')
wine_tsne = TSNE(random_state=RANDOM_STATE).fit_transform(df_wine_kmeans[wine_feats_dec])
tsne_wine_df = pd.DataFrame(wine_tsne)
plot_clusters(tsne_wine_df, df_wine_kmeans['wine_labels'], 'T-SNE Visualization of Wine Segmentation')
wine_umap = umap.UMAP(random_state=RANDOM_STATE, metric='euclidean', min_dist=1, n_neighbors=150, n_components=2)\
.fit_transform(df_wine_kmeans[wine_feats_dec])
umap_wine_df = pd.DataFrame(wine_umap)
OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
plot_clusters(umap_wine_df, df_wine_kmeans['wine_labels'], 'UMAP Visualization of Wine Segmentation')
df_value_kmeans = df[value_feats_mm].copy()
df_value_hclust = df[value_feats_mm].copy()
show_elbow_silhouette(df_value_kmeans, value_feats_mm, max_k=20)
Knee located at k= 6
value_k = 4
value_kmeans = KMeans(n_clusters=value_k, init='k-means++', n_init=100, random_state=RANDOM_STATE)
value_k_labels = value_kmeans.fit_predict(df_value_kmeans)
df_value_kmeans['value_labels'] = value_k_labels
show_cluster_heatmap(df_value_kmeans, 'value_labels', value_feats_mm, 'Cluster Means of Value Segmentation: Kmeans')
r2_(df_value_kmeans,'value_labels')
0.9402739929946685
linkage = 'ward'
distance = 'euclidean'
value_hclust = AgglomerativeClustering(linkage=linkage, affinity=distance, n_clusters=value_k)
value_labels = value_hclust.fit_predict(df_value_hclust)
df_value_hclust['value_labels'] = value_labels
df_vh = df_value_kmeans.copy()
for k in range(2,10):
value_hclust = AgglomerativeClustering(linkage=linkage, affinity=distance, n_clusters=k)
value_h_labels = value_hclust.fit_predict(df_vh)
df_vh['value_'+str(k)] = value_h_labels
r_append = {
'method': 'hclust',
'r2': r2_(df_vh[value_feats_mm+['value_'+str(k)]],'value_'+str(k)),
'clusters':k
}
r_scores_value = r_scores_value.append(r_append, ignore_index=True)
#r_scores_value
df_vk = df_value_kmeans.copy()
for k in range(2,10):
value_kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=30, random_state=RANDOM_STATE)
value_k_labels = value_kmeans.fit_predict(df_vk)
df_vk['value_'+str(k)] = value_k_labels
r_append = {
'method': 'kmeans',
'r2': r2_(df_vk[value_feats_mm+['value_'+str(k)]],'value_'+str(k)),
'clusters':k
}
r_scores_value = r_scores_value.append(r_append, ignore_index=True)
#r_scores_value
show_cluster_heatmap(df_value_hclust, 'value_labels', value_feats_mm, 'Cluster Means of Value Segmentation: Hierarchical')
value_tsne = TSNE(random_state=RANDOM_STATE).fit_transform(df_value_hclust[value_feats_mm])
tsne_value_df = pd.DataFrame(value_tsne)
plot_clusters(tsne_value_df, df_value_hclust['value_labels'], 'T-SNE Visualization of Value Segmentation')
value_umap = umap.UMAP(random_state=RANDOM_STATE, metric='euclidean', min_dist=1, n_neighbors=150, n_components=2)\
.fit_transform(df_value_hclust[value_feats_mm])
umap_value_df = pd.DataFrame(value_umap)
plot_clusters(umap_value_df, df_value_hclust['value_labels'], 'UMAP Visualization of Value Segmentation')
rfm_df[rfm_features]
| Freq | Recency | Monetary | |
|---|---|---|---|
| Custid | |||
| 5325 | 20.0 | 18.0 | 826.0 |
| 3956 | 36.0 | 33.0 | 1852.0 |
| 3681 | 4.0 | 56.0 | 39.0 |
| 2829 | 2.0 | 46.0 | 37.0 |
| 8788 | 2.0 | 3.0 | 36.0 |
| ... | ... | ... | ... |
| 7989 | 4.0 | 33.0 | 59.0 |
| 1383 | 19.0 | 59.0 | 776.0 |
| 4070 | 18.0 | 45.0 | 720.0 |
| 7909 | 3.0 | 65.0 | 47.0 |
| 4914 | 25.0 | 28.0 | 1148.0 |
9517 rows × 3 columns
df_merged = pd.merge(df_value_hclust, df_wine_kmeans, left_index=True, right_index=True)
df_merged = pd.merge(df_merged, rfm_df[rfm_features + rfm_rank_features], left_index=True, right_index=True)
df_merged
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | value_labels | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | labels | wine_labels | Freq | Recency | Monetary | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | ||||||||||||||||||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 | 3 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | 0.01 | 4 | 0 | 20.0 | 18.0 | 826.0 | 4 | 5 | 4 | 4.333333 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 | 1 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | 0.00 | 8 | 2 | 36.0 | 33.0 | 1852.0 | 5 | 4 | 5 | 4.666667 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 | 2 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | 0.48 | 15 | 1 | 4.0 | 56.0 | 39.0 | 1 | 3 | 1 | 1.666667 |
| 2829 | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | 0.55 | 12 | 1 | 2.0 | 46.0 | 37.0 | 1 | 3 | 1 | 1.666667 |
| 8788 | 0.410000 | 0.018868 | 0.03 | 0.101904 | 2 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | 0.28 | 14 | 1 | 2.0 | 3.0 | 36.0 | 1 | 5 | 1 | 2.333333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.320000 | 0.056604 | 0.33 | 0.090146 | 2 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | 0.07 | 2 | 0 | 4.0 | 33.0 | 59.0 | 1 | 4 | 2 | 2.333333 |
| 1383 | 0.831429 | 0.339623 | 0.59 | 0.204367 | 1 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | 0.11 | 2 | 0 | 19.0 | 59.0 | 776.0 | 4 | 3 | 4 | 3.666667 |
| 4070 | 0.065714 | 0.320755 | 0.45 | 0.318589 | 3 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | 0.13 | 3 | 0 | 18.0 | 45.0 | 720.0 | 4 | 3 | 4 | 3.666667 |
| 7909 | 0.098571 | 0.037736 | 0.65 | 0.102464 | 2 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | 0.41 | 15 | 1 | 3.0 | 65.0 | 47.0 | 1 | 2 | 1 | 1.333333 |
| 4914 | 0.612857 | 0.452830 | 0.28 | 0.263718 | 1 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | 0.04 | 4 | 0 | 25.0 | 28.0 | 1148.0 | 4 | 4 | 4 | 4.000000 |
9517 rows × 20 columns
#df_merged = df_value_hclust.copy()
#df_merged = df_merged.merge(df_wine_kmeans, left_index=True, right_index=True )
df_merged
df_merged.columns
Index(['Dayswus_mm', 'Freq_mm', 'Recency_mm', 'LTV_mm', 'value_labels',
'Dryred_dec', 'Sweetred_dec', 'Drywh_dec', 'Sweetwh_dec', 'Dessert_dec',
'Exotic_dec', 'labels', 'wine_labels', 'Freq', 'Recency', 'Monetary',
'Frequency_Rank', 'Recency_Rank', 'Monetary_Rank', 'RFM_Ave'],
dtype='object')
df_merged.groupby(['value_labels', 'wine_labels'])\
.size()\
.to_frame()\
.reset_index()\
.pivot('value_labels', 'wine_labels', 0).fillna('-')
| wine_labels | 0 | 1 | 2 |
|---|---|---|---|
| value_labels | |||
| 0 | 1088 | 865 | 1027 |
| 1 | 713 | 325 | 625 |
| 2 | 1201 | 919 | 1062 |
| 3 | 687 | 364 | 641 |
vw_feats = value_feats_mm+wine_feats_dec
df_centroids = df_merged.groupby(['value_labels', 'wine_labels'])[vw_feats].mean()
df_centroids
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| value_labels | wine_labels | ||||||||||
| 0 | 0 | 0.510616 | 0.180875 | 0.785616 | 0.149469 | 0.524761 | 0.082877 | 0.232399 | 0.069292 | 0.090616 | 0.142564 |
| 1 | 0.570979 | 0.133537 | 0.766509 | 0.124985 | 0.458705 | 0.080231 | 0.297445 | 0.098000 | 0.065376 | 0.293145 | |
| 2 | 0.553638 | 0.167092 | 0.772863 | 0.135983 | 0.569299 | 0.046748 | 0.286796 | 0.046719 | 0.049805 | 0.168520 | |
| 1 | 0 | 0.777481 | 0.540290 | 0.482244 | 0.407824 | 0.565764 | 0.068303 | 0.243226 | 0.057686 | 0.064712 | 0.064236 |
| 1 | 0.784387 | 0.576778 | 0.477754 | 0.443881 | 0.490738 | 0.062769 | 0.325785 | 0.069538 | 0.050923 | 0.116154 | |
| 2 | 0.774176 | 0.563985 | 0.456080 | 0.430316 | 0.480848 | 0.053408 | 0.356256 | 0.055904 | 0.053664 | 0.091280 | |
| 2 | 0 | 0.438938 | 0.113365 | 0.282515 | 0.115817 | 0.500541 | 0.089226 | 0.238843 | 0.072839 | 0.098401 | 0.156045 |
| 1 | 0.436549 | 0.092964 | 0.289859 | 0.109929 | 0.430881 | 0.082764 | 0.300958 | 0.109499 | 0.075778 | 0.307748 | |
| 2 | 0.430517 | 0.107487 | 0.290217 | 0.112255 | 0.553569 | 0.051073 | 0.293842 | 0.051676 | 0.049463 | 0.180669 | |
| 3 | 0 | 0.274325 | 0.467743 | 0.430801 | 0.362451 | 0.550437 | 0.070218 | 0.247394 | 0.063202 | 0.068690 | 0.062678 |
| 1 | 0.260546 | 0.497823 | 0.434368 | 0.395435 | 0.487582 | 0.062280 | 0.326456 | 0.071099 | 0.052473 | 0.098324 | |
| 2 | 0.271382 | 0.486798 | 0.453417 | 0.381729 | 0.458549 | 0.056178 | 0.370250 | 0.060530 | 0.053744 | 0.087410 |
merged_hclust = AgglomerativeClustering(
linkage='ward',
affinity='euclidean',
distance_threshold=0,
n_clusters=None
)
hclust_merged_labels = merged_hclust.fit_predict(df_centroids)
plot_dendrogram(merged_hclust, title='Hierarchical Clustering Dendrogram: Centroids',\
truncate_mode="level", p=15, \
above_threshold_color='k', color_threshold=.65)
final_k = 6
# Re-running the Hierarchical clustering based on the correct number of clusters
hclust_final = AgglomerativeClustering(
linkage='ward',
affinity='euclidean',
n_clusters=final_k
)
hclust_labels_final = hclust_final.fit_predict(df_centroids)
df_centroids['hclust_labels'] = hclust_labels_final
#df_centroids
# Mapper between concatenated clusters and hierarchical clusters
cluster_mapper = df_centroids['hclust_labels'].to_dict()
#print(cluster_mapper)
df_ = df_merged.copy()
# Mapping the hierarchical clusters on the centroids to the observations
df_['merged_labels'] = df_.apply(
lambda row: cluster_mapper[
(row['value_labels'], row['wine_labels'])
], axis=1
)
# Merged cluster centroids
df_.groupby('merged_labels').mean()[vw_feats]
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|---|---|---|---|
| merged_labels | ||||||||||
| 0 | 0.777589 | 0.556326 | 0.471533 | 0.423324 | 0.519188 | 0.061624 | 0.301840 | 0.059333 | 0.057865 | 0.084546 |
| 1 | 0.531506 | 0.174183 | 0.779423 | 0.142921 | 0.546388 | 0.065333 | 0.258813 | 0.058331 | 0.070799 | 0.155168 |
| 2 | 0.270246 | 0.481433 | 0.440136 | 0.376850 | 0.502104 | 0.063191 | 0.310946 | 0.063889 | 0.059539 | 0.079716 |
| 3 | 0.434986 | 0.110606 | 0.286129 | 0.114146 | 0.525426 | 0.071321 | 0.264653 | 0.062908 | 0.075435 | 0.167601 |
| 4 | 0.570979 | 0.133537 | 0.766509 | 0.124985 | 0.458705 | 0.080231 | 0.297445 | 0.098000 | 0.065376 | 0.293145 |
| 5 | 0.436549 | 0.092964 | 0.289859 | 0.109929 | 0.430881 | 0.082764 | 0.300958 | 0.109499 | 0.075778 | 0.307748 |
df_.groupby('merged_labels').count()[vw_feats]
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|---|---|---|---|
| merged_labels | ||||||||||
| 0 | 1663 | 1663 | 1663 | 1663 | 1663 | 1663 | 1663 | 1663 | 1663 | 1663 |
| 1 | 2115 | 2115 | 2115 | 2115 | 2115 | 2115 | 2115 | 2115 | 2115 | 2115 |
| 2 | 1692 | 1692 | 1692 | 1692 | 1692 | 1692 | 1692 | 1692 | 1692 | 1692 |
| 3 | 2263 | 2263 | 2263 | 2263 | 2263 | 2263 | 2263 | 2263 | 2263 | 2263 |
| 4 | 865 | 865 | 865 | 865 | 865 | 865 | 865 | 865 | 865 | 865 |
| 5 | 919 | 919 | 919 | 919 | 919 | 919 | 919 | 919 | 919 | 919 |
df_.to_csv('../../out/data/mergedclusters.csv')
merged_tsne = TSNE(random_state=RANDOM_STATE).fit_transform(df_[vw_feats])
tsne_merged_df = pd.DataFrame(merged_tsne)
plot_clusters(tsne_merged_df, df_['merged_labels'], 'T-SNE Visualization of Merged Clusters')
plot_clusters(tsne_merged_df, df_['value_labels'], 'T-SNE of Merged Clusters : Value Labels')
plot_clusters(tsne_merged_df, df_['wine_labels'], 'T-SNE of Merged Clusters : Wine Labels')
merged_umap = umap.UMAP(random_state=RANDOM_STATE, metric='euclidean', min_dist=1, n_neighbors=150, n_components=2)\
.fit_transform(df_[vw_feats])
umap_merged_df = pd.DataFrame(merged_umap)
plot_clusters(umap_merged_df, df_['merged_labels'], 'UMAP Visualization of Merged Clusters')
#plot_clusters(umap_merged_df, df_['wine_labels'], 'UMAP of Merged Clusters: Wine Labels')
#plot_clusters(umap_merged_df, df_['value_labels'], 'UMAP of Merged Clusters: Value Labels')
df_
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | value_labels | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | ... | labels | wine_labels | Freq | Recency | Monetary | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | merged_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 | 3 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | ... | 4 | 0 | 20.0 | 18.0 | 826.0 | 4 | 5 | 4 | 4.333333 | 2 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 | 1 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | ... | 8 | 2 | 36.0 | 33.0 | 1852.0 | 5 | 4 | 5 | 4.666667 | 0 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 | 2 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | ... | 15 | 1 | 4.0 | 56.0 | 39.0 | 1 | 3 | 1 | 1.666667 | 5 |
| 2829 | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | ... | 12 | 1 | 2.0 | 46.0 | 37.0 | 1 | 3 | 1 | 1.666667 | 4 |
| 8788 | 0.410000 | 0.018868 | 0.03 | 0.101904 | 2 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | ... | 14 | 1 | 2.0 | 3.0 | 36.0 | 1 | 5 | 1 | 2.333333 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.320000 | 0.056604 | 0.33 | 0.090146 | 2 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | ... | 2 | 0 | 4.0 | 33.0 | 59.0 | 1 | 4 | 2 | 2.333333 | 3 |
| 1383 | 0.831429 | 0.339623 | 0.59 | 0.204367 | 1 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | ... | 2 | 0 | 19.0 | 59.0 | 776.0 | 4 | 3 | 4 | 3.666667 | 0 |
| 4070 | 0.065714 | 0.320755 | 0.45 | 0.318589 | 3 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | ... | 3 | 0 | 18.0 | 45.0 | 720.0 | 4 | 3 | 4 | 3.666667 | 2 |
| 7909 | 0.098571 | 0.037736 | 0.65 | 0.102464 | 2 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | ... | 15 | 1 | 3.0 | 65.0 | 47.0 | 1 | 2 | 1 | 1.333333 | 5 |
| 4914 | 0.612857 | 0.452830 | 0.28 | 0.263718 | 1 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | ... | 4 | 0 | 25.0 | 28.0 | 1148.0 | 4 | 4 | 4 | 4.000000 | 0 |
9517 rows × 21 columns
#df_wine_rfm = df_wine_kmeans.copy()
df_wine_rfm = pd.concat([df_wine_kmeans,rfm_df],axis=1)
df_wine_rfm['wine_labels'].unique().tolist()
[0, 2, 1]
wine_labels_list = sorted(df_wine_rfm['wine_labels'].unique().tolist())
fig, axes = plt.subplots(1,len(wine_labels_list), figsize=(17,13), sharey=True)
wine_markers = ['.','+','x']
for l, ax in zip(wine_labels_list, axes.flatten()):
for q in range(len(quantile_values)):
ax.axvline(quantile_values[q]+.49,
label=str(int(quantile_values[q]))+'th Percentile',
color=COLORS[3])
ax.axhline(freq_quints[q]+.5,
label=str(int(quantile_values[q]))+'th Percentile',
color=COLORS[3])
ax.scatter(data=df_wine_rfm.loc[df_wine_rfm['wine_labels']==l,:], x='Recency', y='Freq', #hue='wine_labels',
alpha=.5, s=80, marker=wine_markers[1], cmap=CAT_CMAP,
)
if l== 0:
ax.set_ylabel('Frequency', fontsize=20)
ax.set_xlabel('Recency', fontsize=20)
ax.set_title('Wine Cluster '+str(l), fontsize=20)
#ax.legend([])
#rfm_title='Recency vs Frequency Quintiles'
#plt.suptitle(rfm_title, fontsize=20, y=.91)
#plt.margins(0.025, 0.05)
#if SAVE_PLOTS:
# save_fig(rfm_title, fig)
plt.tight_layout()
plt.show()
df_value_rfm = pd.concat([df_value_kmeans,rfm_df],axis=1)
sorted(df_value_rfm['value_labels'].unique().tolist())
[0, 1, 2, 3]
val_labels_list = sorted(df_value_rfm['value_labels'].unique().tolist())
fig, axes = plt.subplots(1,len(val_labels_list), figsize=(17,13), sharey=True)
val_markers = ['.','+','x']
for l, ax in zip(val_labels_list, axes.flatten()):
for q in range(len(quantile_values)):
ax.axvline(quantile_values[q]+.49,
label=str(int(quantile_values[q]))+'th Percentile',
color=COLORS[3])
ax.axhline(freq_quints[q]+.5,
label=str(int(quantile_values[q]))+'th Percentile',
color=COLORS[3])
ax.scatter(data=df_value_rfm.loc[df_value_rfm['value_labels']==l,:], x='Recency', y='Freq', #hue='wine_labels',
alpha=.5, s=80, marker=val_markers[1], cmap=CAT_CMAP,
)
if l== 0:
ax.set_ylabel('Frequency', fontsize=20)
ax.set_xlabel('Recency', fontsize=20)
ax.set_title('Value Cluster '+str(l), fontsize=20)
#ax.legend([])
#rfm_title='Recency vs Frequency Quintiles'
#plt.suptitle(rfm_title, fontsize=20, y=.91)
#plt.margins(0.025, 0.05)
#if SAVE_PLOTS:
# save_fig(rfm_title, fig)
plt.tight_layout()
plt.show()
## Function to plot histograms of numeric features for specified dataframe
def plot_final_histo_box(df, features, col, title = "Final Clusters: Relative Distributions of Numeric Variables"):
if show_plots:
rows = sorted(df[col].unique().tolist())
cols = range(len(features))
fig = plt.figure(figsize=(22,28), \
constrained_layout=True)
subfigs = fig.subfigures(len(rows), len(features), facecolor='#fdfdfd')
for c in cols: # feats
for r in rows: # clusters
color = cm.viridis(float(r) / len(rows))
df_ = df.loc[df[col]==r,[features[c]]]
axs = subfigs[r][c].subplots(2, 1, sharex=True, \
gridspec_kw={'height_ratios': [4,1]})
axs[0].hist(df_, color=color)
axs[0].set_xlim(0,1)
axs[0].set_title(features[c], y=1, fontsize=20)
sns.boxplot(x=df_[features[c]], ax=axs[1], color=color)
axs[1].set_xlabel(None)
if c==0:
csize = ' [ ' + str(len(df_)) + ' ]'
axs[0].set_ylabel('Cluster '+ str(r)+csize)
plt.suptitle(title, fontsize=24)
if SAVE_PLOTS:
save_fig(title, fig)
plt.show()
else:
print("show_plots is currently set to False")
df_.columns
Index(['Dayswus_mm', 'Freq_mm', 'Recency_mm', 'LTV_mm', 'value_labels',
'Dryred_dec', 'Sweetred_dec', 'Drywh_dec', 'Sweetwh_dec', 'Dessert_dec',
'Exotic_dec', 'labels', 'wine_labels', 'Freq', 'Recency', 'Monetary',
'Frequency_Rank', 'Recency_Rank', 'Monetary_Rank', 'RFM_Ave',
'merged_labels'],
dtype='object')
plot_final_histo_box(df_, wine_feats_dec, 'merged_labels', title='Final Clusters, Wine Features')
plot_final_histo_box(df_, value_feats_mm, 'merged_labels', title='Final Clusters, Value Features')
#plot_final_histo_box(df_, wine_feats_dec, 'wine_labels', title='Wine Segmentation, Wine Features')
#plot_final_histo_box(df_, value_feats_mm, 'value_labels', title='Value Segmentation, Value Features')
def hist_cluster_vs_all(df, k, col, feats, title='Mean Values, Clusters vs Population'):
clustermeans_ = df.groupby(col).mean()[feats].reset_index()
fig, axes = plt.subplots(2,int(len(feats)/2), figsize=(17,7), constrained_layout=True)
i = 0
colors = [ (cm.viridis(float(i) / len(clustermeans_['merged_labels']))) for i in range(len(clustermeans_['merged_labels'])) ]
for m, ax in zip(feats, axes.flatten()):
ax.bar(height=clustermeans_.loc[:,m], x=clustermeans_['merged_labels'].astype(str), color=colors)
m_ = df[feats].mean()[m]
ax.axhline(m_, label='Pop. mean', color='k',linestyle='dashed')
ax.set_xlabel('Cluster Labels')
if m == wine_feats_dec[-1]:
ax.legend(bbox_to_anchor=(1,1.02), loc="upper right", frameon=False)
ax.set_title(m)
fig.suptitle(title, fontsize=20)
save_fig(title, fig)
plt.show()
hist_cluster_vs_all(df_,5,'merged_labels', vw_feats)
df_[value_feats_mm]
df_[wine_feats_dec]
| Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|
| Custid | ||||||
| 5325 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | 0.01 |
| 3956 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | 0.00 |
| 3681 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | 0.48 |
| 2829 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | 0.55 |
| 8788 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | 0.28 |
| ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | 0.07 |
| 1383 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | 0.11 |
| 4070 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | 0.13 |
| 7909 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | 0.41 |
| 4914 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | 0.04 |
9517 rows × 6 columns
def show_value_wine_scatter(df, row_feats, col_feats, title='Scatter Plot of Features'):
fig, axes = plt.subplots(len(row_feats), len(col_feats),
figsize=(15,11), sharex=True)
for row in range(len(row_feats)):
for col in range(len(col_feats)):
x=df[row_feats[row]]
y=df[col_feats[col]]
axes[row][col].scatter(x,y, alpha=.25, s=1)
if col == 0:
axes[row][col].set_ylabel(row_feats[row])
if row == 0:
axes[row][col].set_title(col_feats[col], y=1)
plt.suptitle(title)
plt.tight_layout()
plt.show()
df_
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | value_labels | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | ... | labels | wine_labels | Freq | Recency | Monetary | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | merged_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 | 3 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | ... | 4 | 0 | 20.0 | 18.0 | 826.0 | 4 | 5 | 4 | 4.333333 | 2 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 | 1 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | ... | 8 | 2 | 36.0 | 33.0 | 1852.0 | 5 | 4 | 5 | 4.666667 | 0 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 | 2 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | ... | 15 | 1 | 4.0 | 56.0 | 39.0 | 1 | 3 | 1 | 1.666667 | 5 |
| 2829 | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | ... | 12 | 1 | 2.0 | 46.0 | 37.0 | 1 | 3 | 1 | 1.666667 | 4 |
| 8788 | 0.410000 | 0.018868 | 0.03 | 0.101904 | 2 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | ... | 14 | 1 | 2.0 | 3.0 | 36.0 | 1 | 5 | 1 | 2.333333 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.320000 | 0.056604 | 0.33 | 0.090146 | 2 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | ... | 2 | 0 | 4.0 | 33.0 | 59.0 | 1 | 4 | 2 | 2.333333 | 3 |
| 1383 | 0.831429 | 0.339623 | 0.59 | 0.204367 | 1 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | ... | 2 | 0 | 19.0 | 59.0 | 776.0 | 4 | 3 | 4 | 3.666667 | 0 |
| 4070 | 0.065714 | 0.320755 | 0.45 | 0.318589 | 3 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | ... | 3 | 0 | 18.0 | 45.0 | 720.0 | 4 | 3 | 4 | 3.666667 | 2 |
| 7909 | 0.098571 | 0.037736 | 0.65 | 0.102464 | 2 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | ... | 15 | 1 | 3.0 | 65.0 | 47.0 | 1 | 2 | 1 | 1.333333 | 5 |
| 4914 | 0.612857 | 0.452830 | 0.28 | 0.263718 | 1 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | ... | 4 | 0 | 25.0 | 28.0 | 1148.0 | 4 | 4 | 4 | 4.000000 | 0 |
9517 rows × 21 columns
df_.loc[df_['Recency']<=100,:]
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | value_labels | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | ... | labels | wine_labels | Freq | Recency | Monetary | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | merged_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 | 3 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | ... | 4 | 0 | 20.0 | 18.0 | 826.0 | 4 | 5 | 4 | 4.333333 | 2 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 | 1 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | ... | 8 | 2 | 36.0 | 33.0 | 1852.0 | 5 | 4 | 5 | 4.666667 | 0 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 | 2 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | ... | 15 | 1 | 4.0 | 56.0 | 39.0 | 1 | 3 | 1 | 1.666667 | 5 |
| 2829 | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | ... | 12 | 1 | 2.0 | 46.0 | 37.0 | 1 | 3 | 1 | 1.666667 | 4 |
| 8788 | 0.410000 | 0.018868 | 0.03 | 0.101904 | 2 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | ... | 14 | 1 | 2.0 | 3.0 | 36.0 | 1 | 5 | 1 | 2.333333 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.320000 | 0.056604 | 0.33 | 0.090146 | 2 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | ... | 2 | 0 | 4.0 | 33.0 | 59.0 | 1 | 4 | 2 | 2.333333 | 3 |
| 1383 | 0.831429 | 0.339623 | 0.59 | 0.204367 | 1 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | ... | 2 | 0 | 19.0 | 59.0 | 776.0 | 4 | 3 | 4 | 3.666667 | 0 |
| 4070 | 0.065714 | 0.320755 | 0.45 | 0.318589 | 3 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | ... | 3 | 0 | 18.0 | 45.0 | 720.0 | 4 | 3 | 4 | 3.666667 | 2 |
| 7909 | 0.098571 | 0.037736 | 0.65 | 0.102464 | 2 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | ... | 15 | 1 | 3.0 | 65.0 | 47.0 | 1 | 2 | 1 | 1.333333 | 5 |
| 4914 | 0.612857 | 0.452830 | 0.28 | 0.263718 | 1 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | ... | 4 | 0 | 25.0 | 28.0 | 1148.0 | 4 | 4 | 4 | 4.000000 | 0 |
9517 rows × 21 columns
show_value_wine_scatter(df_, value_feats_mm, wine_feats_dec, title='Scatter Plot of Wine Features against Value Features')
clusterlabels = sorted(df_['wine_labels'].unique().tolist())
for c in clusterlabels:
plottitle ='Scatter Plot of Wine Features against Value Features: Cluster '+ str(c)
show_value_wine_scatter(df_.loc[df_['wine_labels']==c,:], value_feats_mm, wine_feats_dec, plottitle)
df_original
df_
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | value_labels | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | ... | labels | wine_labels | Freq | Recency | Monetary | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | merged_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 | 3 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | ... | 4 | 0 | 20.0 | 18.0 | 826.0 | 4 | 5 | 4 | 4.333333 | 2 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 | 1 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | ... | 8 | 2 | 36.0 | 33.0 | 1852.0 | 5 | 4 | 5 | 4.666667 | 0 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 | 2 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | ... | 15 | 1 | 4.0 | 56.0 | 39.0 | 1 | 3 | 1 | 1.666667 | 5 |
| 2829 | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | ... | 12 | 1 | 2.0 | 46.0 | 37.0 | 1 | 3 | 1 | 1.666667 | 4 |
| 8788 | 0.410000 | 0.018868 | 0.03 | 0.101904 | 2 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | ... | 14 | 1 | 2.0 | 3.0 | 36.0 | 1 | 5 | 1 | 2.333333 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.320000 | 0.056604 | 0.33 | 0.090146 | 2 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | ... | 2 | 0 | 4.0 | 33.0 | 59.0 | 1 | 4 | 2 | 2.333333 | 3 |
| 1383 | 0.831429 | 0.339623 | 0.59 | 0.204367 | 1 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | ... | 2 | 0 | 19.0 | 59.0 | 776.0 | 4 | 3 | 4 | 3.666667 | 0 |
| 4070 | 0.065714 | 0.320755 | 0.45 | 0.318589 | 3 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | ... | 3 | 0 | 18.0 | 45.0 | 720.0 | 4 | 3 | 4 | 3.666667 | 2 |
| 7909 | 0.098571 | 0.037736 | 0.65 | 0.102464 | 2 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | ... | 15 | 1 | 3.0 | 65.0 | 47.0 | 1 | 2 | 1 | 1.333333 | 5 |
| 4914 | 0.612857 | 0.452830 | 0.28 | 0.263718 | 1 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | ... | 4 | 0 | 25.0 | 28.0 | 1148.0 | 4 | 4 | 4 | 4.000000 | 0 |
9517 rows × 21 columns
df_full = pd.merge(df_original, df_, how='left',left_index=True, right_index=True, suffixes=('', '_drop')) #.drop_duplicates()
df_full.drop([col for col in df_full.columns if 'drop' in col], axis=1, inplace=True)
df_full
#df_full['merged_labels'].isna().sum()
| Dayswus | Age | Edu | Income | Freq | Recency | Monetary | LTV | Perdeal | Dryred | ... | Sweetwh_dec | Dessert_dec | Exotic_dec | labels | wine_labels | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | merged_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 653.0 | 55.0 | 20.0 | 78473.0 | 20.0 | 18.0 | 826.0 | 445.0 | 7.0 | 67.0 | ... | 0.02 | 0.01 | 0.01 | 4.0 | 0.0 | 4.0 | 5.0 | 4.0 | 4.333333 | 2.0 |
| 3956 | 1041.0 | 75.0 | 18.0 | 105087.0 | 36.0 | 33.0 | 1852.0 | 539.0 | 2.0 | 49.0 | ... | 0.01 | 0.03 | 0.00 | 8.0 | 2.0 | 5.0 | 4.0 | 5.0 | 4.666667 | 0.0 |
| 3681 | 666.0 | 18.0 | 12.0 | 27984.0 | 4.0 | 56.0 | 39.0 | -7.0 | 88.0 | 4.0 | ... | 0.32 | 0.21 | 0.48 | 15.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.666667 | 5.0 |
| 2829 | 1049.0 | 42.0 | 16.0 | 61748.0 | 2.0 | 46.0 | 37.0 | -6.0 | 70.0 | 86.0 | ... | 0.01 | 0.01 | 0.55 | 12.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.666667 | 4.0 |
| 8788 | 837.0 | 47.0 | 16.0 | 65789.0 | 2.0 | 3.0 | 36.0 | 4.0 | 35.0 | 85.0 | ... | 0.02 | 0.01 | 0.28 | 14.0 | 1.0 | 1.0 | 5.0 | 1.0 | 2.333333 | 5.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1383 | 1132.0 | 57.0 | 20.0 | 81033.0 | 19.0 | 59.0 | 776.0 | 187.0 | 22.0 | 78.0 | ... | 0.01 | 0.01 | 0.11 | 2.0 | 0.0 | 4.0 | 3.0 | 4.0 | 3.666667 | 0.0 |
| 4070 | 596.0 | 66.0 | 15.0 | 84714.0 | 18.0 | 45.0 | 720.0 | 391.0 | 5.0 | 30.0 | ... | 0.10 | 0.12 | 0.13 | 3.0 | 0.0 | 4.0 | 3.0 | 4.0 | 3.666667 | 2.0 |
| 7909 | 619.0 | 18.0 | 12.0 | 40466.0 | 3.0 | 65.0 | 47.0 | 5.0 | 23.0 | 6.0 | ... | 0.38 | 0.22 | 0.41 | 15.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.333333 | 5.0 |
| 4158 | 1107.0 | 33.0 | 16.0 | 53661.0 | 1.0 | 368.0 | 15.0 | 2.0 | 35.0 | 18.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4914 | 979.0 | 55.0 | 16.0 | 94926.0 | 25.0 | 28.0 | 1148.0 | 293.0 | 7.0 | 63.0 | ... | 0.11 | 0.03 | 0.04 | 4.0 | 0.0 | 4.0 | 4.0 | 4.0 | 4.000000 | 0.0 |
10000 rows × 35 columns
Xv = df_full.loc[~df_full['merged_labels'].isna(),value_features]
yv = df_full.loc[~df_full['merged_labels'].isna(),'value_labels']
X_train, X_test, y_train, y_test = train_test_split(
Xv, yv, test_size=0.2, random_state=RANDOM_STATE
)
# Fitting the decision tree
dtv = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
dtv.fit(X_train, y_train)
print("On average, we are able to predict an estimated {0:.2f}% of the customers' Value labels correctly".format(dtv.score(X_test, y_test)*100))
On average, we are able to predict an estimated 89.60% of the customers' Value labels correctly
pd.DataFrame(dtv.feature_importances_, index=X_train.columns).sort_values(by=0, ascending=False)
| 0 | |
|---|---|
| Recency | 0.390685 |
| Freq | 0.270122 |
| Dayswus | 0.265932 |
| LTV | 0.073261 |
# Predicting the cluster labels of the noise and outliers
df_full.loc[df_full['merged_labels'].isna(),'value_labels'] = dtv.predict(df_full.loc[df_full['merged_labels'].isna(),value_features])
Xw = df_full.loc[~df_full['merged_labels'].isna(),wine_features]
yw = df_full.loc[~df_full['merged_labels'].isna(),'wine_labels']
Xw_train, Xw_test, yw_train, yw_test = train_test_split(
Xw, yw, test_size=0.2, random_state=RANDOM_STATE
)
# Fitting the decision tree
dtw = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
dtw.fit(Xw_train, yw_train)
print("On average, we are able to predict an estimated {0:.2f}% of the customers' Wine labels correctly".format(dtw.score(Xw_test, yw_test)*100))
On average, we are able to predict an estimated 86.03% of the customers' Wine labels correctly
pd.DataFrame(dtw.feature_importances_, index=Xw_train.columns).sort_values(by=0, ascending=False)
| 0 | |
|---|---|
| Dryred | 0.429693 |
| Exotic | 0.313179 |
| Drywh | 0.213042 |
| Sweetwh | 0.036956 |
| Sweetred | 0.007130 |
| Dessert | 0.000000 |
df_full.loc[df_full['merged_labels'].isna(),'wine_labels'] = dtw.predict(df_full.loc[df_full['merged_labels'].isna(),wine_features])
val_labels_list
[0, 1, 2, 3]
fig, axes = plt.subplots(1,1,figsize = (80,24))
tree.plot_tree(dtv,
feature_names = Xv.columns.to_list(),
class_names=['0','1','2','3','4','5'],
filled = False,
fontsize=24, ax=axes);
plt.show()
fig, axes = plt.subplots(1,1,figsize = (80,24))
tree.plot_tree(dtw,
feature_names = Xw.columns.to_list(),
class_names=['0','1','2','3','4'],
filled = False,
fontsize=24, ax=axes);
plt.show()
## Code based from sklearn documentation:
## https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html?highlight=silhouette
def plotSilhouettes(X, max_clust, plot_dim, title = "Silhouette analysis for KMeans clustering with different K sizes"):
range_n_clusters = range(2, max_clust+1)
fig, axes = plt.subplots(plot_dim[0], plot_dim[1], sharex=True, figsize=(19,11))
#fig.set_size_inches(11, 11)
for ax, nclust in zip(axes.flatten(), range_n_clusters):
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax.set_ylim([0, len(X) + (nclust + 1) * 10])
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=nclust, random_state=10)
cluster_labels = clusterer.fit_predict(X)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
silhouette_avg = silhouette_score(X, cluster_labels)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(nclust):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.viridis(float(i) / nclust)
#color = COLORS[i]
ax.fill_betweenx(
np.arange(y_lower, y_upper),
0,
ith_cluster_silhouette_values,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
# Label the silhouette plots with their cluster numbers at the middle
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize=14)
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax.set_title("K = "+ str(nclust))
ax.set_ylabel("Cluster label")
ax.set_yticks([]) # Clear the yaxis labels / ticks
ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# The vertical line for average silhouette score of all the values
ax.axvline(x=silhouette_avg, color="red", linestyle="--", label='Average\nSilhouette\nScore')
if nclust == 2 :
ax.legend(frameon=False)
plt.suptitle(
title,
fontsize=18,
y=.95
)
save_fig(title, fig)
plt.show()
plotSilhouettes(df_wine_kmeans[wine_feats_dec], 9, [2,4], title='Silhouette Analysis for Kmeans Clustering at Various K, Wine Segmentation')
plotSilhouettes(df_value_kmeans[value_feats_mm], 9, [2,4], title='Silhouette Analysis for Kmeans Clustering at Various K, Value Segmentation')
df_
| Dayswus_mm | Freq_mm | Recency_mm | LTV_mm | value_labels | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | ... | labels | wine_labels | Freq | Recency | Monetary | Frequency_Rank | Recency_Rank | Monetary_Rank | RFM_Ave | merged_labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Custid | |||||||||||||||||||||
| 5325 | 0.147143 | 0.358491 | 0.18 | 0.348824 | 3 | 0.67 | 0.04 | 0.26 | 0.02 | 0.01 | ... | 4 | 0 | 20.0 | 18.0 | 826.0 | 4 | 5 | 4 | 4.333333 | 2 |
| 3956 | 0.701429 | 0.660377 | 0.33 | 0.401456 | 1 | 0.49 | 0.00 | 0.46 | 0.01 | 0.03 | ... | 8 | 2 | 36.0 | 33.0 | 1852.0 | 5 | 4 | 5 | 4.666667 | 0 |
| 3681 | 0.165714 | 0.056604 | 0.56 | 0.095745 | 2 | 0.04 | 0.29 | 0.14 | 0.32 | 0.21 | ... | 15 | 1 | 4.0 | 56.0 | 39.0 | 1 | 3 | 1 | 1.666667 | 5 |
| 2829 | 0.712857 | 0.018868 | 0.46 | 0.096305 | 0 | 0.86 | 0.01 | 0.11 | 0.01 | 0.01 | ... | 12 | 1 | 2.0 | 46.0 | 37.0 | 1 | 3 | 1 | 1.666667 | 4 |
| 8788 | 0.410000 | 0.018868 | 0.03 | 0.101904 | 2 | 0.85 | 0.00 | 0.12 | 0.02 | 0.01 | ... | 14 | 1 | 2.0 | 3.0 | 36.0 | 1 | 5 | 1 | 2.333333 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7989 | 0.320000 | 0.056604 | 0.33 | 0.090146 | 2 | 0.73 | 0.01 | 0.25 | 0.01 | 0.00 | ... | 2 | 0 | 4.0 | 33.0 | 59.0 | 1 | 4 | 2 | 2.333333 | 3 |
| 1383 | 0.831429 | 0.339623 | 0.59 | 0.204367 | 1 | 0.78 | 0.00 | 0.20 | 0.01 | 0.01 | ... | 2 | 0 | 19.0 | 59.0 | 776.0 | 4 | 3 | 4 | 3.666667 | 0 |
| 4070 | 0.065714 | 0.320755 | 0.45 | 0.318589 | 3 | 0.30 | 0.12 | 0.36 | 0.10 | 0.12 | ... | 3 | 0 | 18.0 | 45.0 | 720.0 | 4 | 3 | 4 | 3.666667 | 2 |
| 7909 | 0.098571 | 0.037736 | 0.65 | 0.102464 | 2 | 0.06 | 0.24 | 0.10 | 0.38 | 0.22 | ... | 15 | 1 | 3.0 | 65.0 | 47.0 | 1 | 2 | 1 | 1.333333 | 5 |
| 4914 | 0.612857 | 0.452830 | 0.28 | 0.263718 | 1 | 0.63 | 0.10 | 0.13 | 0.11 | 0.03 | ... | 4 | 0 | 25.0 | 28.0 | 1148.0 | 4 | 4 | 4 | 4.000000 | 0 |
9517 rows × 21 columns
som_vars = value_feats_mm
df_som = df_nonoise.copy()
## From Lab 11
np.random.seed(RANDOM_STATE)
smv = sompy.SOMFactory().build(
df_som[som_vars].values,
mapsize=[15,15],
initialization='random',
neighborhood='gaussian',
training='batch',
lattice='hexa',
component_names=som_vars
)
smv.train(n_job=4, train_rough_len=50, train_finetune_len=50)
u = sompy.umatrix.UMatrixView(8, 8, 'umatrix', show_axis=True, text_size=8, show_text=True)
UMAT = u.show(
smv,
distance=4,
row_normalized=False,
show_data=True,
contour=True, # Visualize isomorphic curves
blob=True
)
vhts = BmuHitsView(9,9,"Hits Map")
vhts.show(smv, anotate=True, onlyzeros=False, labelsize=8, cmap="Blues")
plt.show()
def cluster_on_som(sm, k, df_, som_vars):
df = df_.copy()
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=20, random_state=random_state)
nodeclus_labels = kmeans.fit_predict(sm.codebook.matrix)
sm.cluster_labels = nodeclus_labels
# Check the nodes and and respective clusters
nodes = sm.codebook.matrix
somlabelk = 'som_label_'+str(k)
df_nodes = pd.DataFrame(nodes, columns=som_vars)
df_nodes[somlabelk] = nodeclus_labels
# Obtaining SOM's BMUs labels
bmus_map = sm.find_bmu(df[som_vars])[0] # get bmus for each observation in df
bmuk = 'BMU'+str(k)
df[bmuk] = bmus_map
df_bmus = df.copy()
# Get cluster labels for each observation
df = df_bmus.merge(df_nodes[somlabelk], 'left', left_on=bmuk, right_index=True)
s_ = df.groupby([somlabelk]).count().iloc[:,0].tolist()
r_ = r2_(df[som_vars + [somlabelk]],somlabelk)
return ({'k':k,'r':r_,'sizes':s_}, df[somlabelk])
kval = 10
som_k_sizes_v = pd.DataFrame(columns=['k','r','sizes'])
for k in range(2,kval+2):
cluster_on_som_res_d = cluster_on_som(smv,k, df_som, som_vars)
som_k_sizes_v = som_k_sizes_v.append(cluster_on_som_res_d[0],ignore_index=True)
som_k_sizes_v
| k | r | sizes | |
|---|---|---|---|
| 0 | 2 | 0.603309 | [4429, 5088] |
| 1 | 3 | 0.663527 | [510, 1704, 7303] |
| 2 | 4 | 0.857904 | [7499, 717, 1301] |
| 3 | 5 | 0.880970 | [717, 1842, 5657, 1301] |
| 4 | 6 | 0.852878 | [7538, 865, 1114] |
| 5 | 7 | 0.622983 | [64, 377, 8873, 203] |
| 6 | 8 | 0.850845 | [3, 865, 7578, 1071] |
| 7 | 9 | 0.891221 | [8379, 1071, 64, 3] |
| 8 | 10 | 0.922906 | [103, 865, 1074, 7475] |
| 9 | 11 | 0.975455 | [804, 1211, 200, 7302] |
som_k_sizes_v.sort_values(by=['r'], ascending=False)
| k | r | sizes | |
|---|---|---|---|
| 9 | 11 | 0.975455 | [804, 1211, 200, 7302] |
| 8 | 10 | 0.922906 | [103, 865, 1074, 7475] |
| 7 | 9 | 0.891221 | [8379, 1071, 64, 3] |
| 3 | 5 | 0.880970 | [717, 1842, 5657, 1301] |
| 2 | 4 | 0.857904 | [7499, 717, 1301] |
| 4 | 6 | 0.852878 | [7538, 865, 1114] |
| 6 | 8 | 0.850845 | [3, 865, 7578, 1071] |
| 1 | 3 | 0.663527 | [510, 1704, 7303] |
| 5 | 7 | 0.622983 | [64, 377, 8873, 203] |
| 0 | 2 | 0.603309 | [4429, 5088] |
k = 8
cluster_on_som_res = cluster_on_som(smv,k, df_som, som_vars)
df_som['som_value'] = cluster_on_som_res[1]
# Perform K-Means clustering with selected K
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=20, random_state=random_state)
nodeclus_labels = kmeans.fit_predict(smv.codebook.matrix)
smv.cluster_labels = nodeclus_labels # setting the cluster labels of sompy
hits = HitMapView(8, 8,"Clustering", text_size=10)
hits.show(smv, anotate=True, onlyzeros=False, labelsize=7, cmap="Pastel1")
plt.show()
tsnemodel_somv = TSNE(random_state=RANDOM_STATE).fit_transform(df_som[value_feats_mm])
tsne_value_somv = pd.DataFrame(tsnemodel_somv)
plot_clusters(tsne_value_somv, df_som['som_value'], 'T-SNE Visualization of Value Segmentation Using SOM')
df_wine = df[wine_feats_dec].copy()
np.random.seed(RANDOM_STATE)
som_vars_w = wine_feats_dec
# Training the SOM
smw = sompy.SOMFactory().build(
df_wine.values,
mapsize=[15, 15],
initialization='random',
neighborhood='gaussian',
training='batch',
lattice='hexa',
component_names=wine_feats_dec
)
smw.train(n_job=-1, verbose='info', train_rough_len=100, train_finetune_len=100)
kmeans_somw = KMeans(n_clusters=4, init='k-means++', n_init=20, random_state=RANDOM_STATE)
nodeclus_labels = kmeans_somw.fit_predict(smw.codebook.matrix)
smw.cluster_labels = nodeclus_labels # setting the cluster labels of sompy
hits = HitMapView(12, 12,"Clustering", text_size=10)
hits.show(smw, anotate=True, onlyzeros=False, labelsize=7, cmap="Pastel1")
plt.show()
# Check the nodes and and respective clusters
wine_som_nodes = smw.codebook.matrix
df_wine_som_nodes = pd.DataFrame(wine_som_nodes, columns=wine_feats_dec)
df_wine_som_nodes['wine_nodes'] = nodeclus_labels
#df_wine_som_nodes
# Obtaining SOM's BMUs labels
bmus_map = smw.find_bmu(df_wine)[0] # get bmus for each observation in df
df_bmus = pd.DataFrame(
np.concatenate((df, np.expand_dims(bmus_map,1)), axis=1),
index=df.index, columns=np.append(df.columns,"BMU")
)
#df_bmus
# Get cluster labels for each observation
dfw_ = df_bmus.merge(df_wine_som_nodes['wine_nodes'], 'left', left_on="BMU", right_index=True)
# Characterizing the final clusters
dfw_.drop(columns='BMU').groupby('wine_nodes').mean()
| Dayswus | Age | Edu | Income | Freq | Recency | Monetary | LTV | Perdeal | Dryred | ... | Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | Recency_Rank | Frequency_Rank | Monetary_Rank | RFM_Ave | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wine_nodes | |||||||||||||||||||||
| 0 | 899.992120 | 47.779512 | 16.461888 | 69795.748750 | 15.106077 | 49.976815 | 650.928019 | 227.554326 | 32.086225 | 41.815275 | ... | 0.418153 | 0.083044 | 0.333404 | 0.082676 | 0.082620 | 0.192408 | 3.013638 | 2.838915 | 2.932262 | 2.928272 |
| 1 | 896.391549 | 48.016901 | 15.484507 | 70393.005634 | 17.154930 | 48.602817 | 775.898592 | 302.385915 | 26.698592 | 17.912676 | ... | 0.179127 | 0.159577 | 0.334789 | 0.173127 | 0.153099 | 0.090366 | 3.078873 | 3.028169 | 3.081690 | 3.062911 |
| 2 | 899.285603 | 51.346079 | 17.731174 | 75126.315646 | 15.368709 | 50.088178 | 630.557160 | 182.542333 | 29.511120 | 79.102614 | ... | 0.791026 | 0.019423 | 0.151881 | 0.019844 | 0.017187 | 0.084881 | 3.010925 | 3.068279 | 3.154116 | 3.077773 |
3 rows × 32 columns
dfw_.drop('BMU', axis=1, inplace=True)
tsnemodel_somw = TSNE(random_state=42, perplexity=20).fit_transform(dfw_[wine_feats_dec])
tsne_value_somw = pd.DataFrame(tsnemodel_somw)
plot_clusters(tsne_value_somw, dfw_['wine_nodes'], 'T-SNE Visualization of Wine Segmentation Using SOM')
df_msv = df_nonoise[value_feats_mm].copy()
bandwidth = estimate_bandwidth(df_msv, quantile=.05, random_state=RANDOM_STATE, n_jobs=-1)
bandwidth
0.25442455179585705
msv = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=4)
ms_labels = msv.fit_predict(df_msv)
ms_n_clusters = len(np.unique(ms_labels))
print("Number of estimated clusters : %d" % ms_n_clusters)
df_msv['ms_value'] = ms_labels
Number of estimated clusters : 4
plot_clusters(tsne_value_somv, df_msv['ms_value'], 'T-SNE Visualization of Value Segmentation Using Mean Shift')
df_msw = df_nonoise[wine_feats_dec].copy()
bandwidth = estimate_bandwidth(df_msw, quantile=.1, random_state=RANDOM_STATE, n_jobs=-1)
bandwidth
0.20007293199294793
msw = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=4)
msw_labels = msw.fit_predict(df_msw)
ms_n_clusters = len(np.unique(msw_labels))
print("Number of estimated clusters : %d" % ms_n_clusters)
df_msw['ms_wine'] = msw_labels
Number of estimated clusters : 3
plot_clusters(tsne_value_somw, df_msw['ms_wine'], 'T-SNE Visualization of Value Segmentation Using Mean Shift')
df_dbv = df_nonoise[wine_feats_dec].copy()
# K-distance graph to find out the right eps value
neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(df_dbv)
distances, _ = neigh.kneighbors(df_dbv)
distances = np.sort(distances[:, -1])
fig = plt.figure()
plt.plot(distances)
save_fig('Epsilon Plot for DBScan', fig)
plt.show()
# Perform DBSCAN clustering
dbscanv = DBSCAN(eps=0.12, min_samples=10, n_jobs=4)
dbscan_labels = dbscanv.fit_predict(df_dbv)
dbscan_n_clusters = len(np.unique(dbscan_labels))
print("Number of estimated clusters : %d" % dbscan_n_clusters)
df_dbv['db_value'] = dbscan_labels
Number of estimated clusters : 2
df_dbv.groupby(['db_value']).count()
| Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|
| db_value | ||||||
| -1 | 421 | 421 | 421 | 421 | 421 | 421 |
| 0 | 9096 | 9096 | 9096 | 9096 | 9096 | 9096 |
plot_clusters(tsne_value_somv, df_dbv['db_value'], 'T-SNE Visualization of Value Segmentation Using DBSCAN')
df_db = df_nonoise[wine_feats_dec].copy()
# K-distance graph to find out the right eps value
neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(df_db)
distances, _ = neigh.kneighbors(df_db)
distances = np.sort(distances[:, -1])
fig = plt.figure()
plt.plot(distances)
save_fig('Epsilon Plot for DBScan', fig)
plt.show()
# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.13, min_samples=10, n_jobs=4)
dbscan_labels = dbscan.fit_predict(df_db)
dbscan_n_clusters = len(np.unique(dbscan_labels))
print("Number of estimated clusters : %d" % dbscan_n_clusters)
df_db['db_wine'] = dbscan_labels
Number of estimated clusters : 2
df_db.groupby(['db_wine']).count()
| Dryred_dec | Sweetred_dec | Drywh_dec | Sweetwh_dec | Dessert_dec | Exotic_dec | |
|---|---|---|---|---|---|---|
| db_wine | ||||||
| -1 | 259 | 259 | 259 | 259 | 259 | 259 |
| 0 | 9258 | 9258 | 9258 | 9258 | 9258 | 9258 |
plot_clusters(tsne_value_somv, df_db['db_wine'], 'T-SNE Visualization of Wine Segmentation Using DBSCAN')